Load data (list of articles) from a .csv file

# Read the cleaned list of articles exported from the EGM screening step.
articles_csv <- here("data", "20201215_EGM_Net_all-articles_clean.csv")
dat <- read.csv(articles_csv)
dim(dat) # expect 1019 rows x 16 columns
## [1] 1019   16
#names(dat)
#hist(dat$year)

# Initial exploratory checks (kept for reference, not run):
# length(unique(dat$Title))  # 1019 - every title is unique
# title <- unique(dat$Title)
# length(unique(dat$DOI))    # only 184 DOI values; missing DOIs are stored as "" (convert to NA later?)
# table(dat$Item_Type)       # mostly articles
# hist(dat$Pub_Year, breaks = 70)
# table(dat$Manual_Tags)     # character strings, tags separated by "; "
# #table(dat$Item_Type)["journalArticle"]  # 959 journal articles

Query for LENS API using article titles

DO NOT RUN - the output was saved to a file at the end of this chunk; that saved output is loaded at the start of the next chunk, to avoid the long wait for the LENS API query.

#subset of first 50  publications, (used for testing)
#dat50 <- dat[1:50, ] # use instead of "dat" in the code below

#Note: if more than 50 requests are sent per minute, the API responds with "code: 1 rate_limited Too many requests. Allowed '50' per minute. 429"; the LENS API call loop was changed to accommodate this by adding waiting time between iterations (works slower but returns all available records)

# Query the LENS scholarly search API.
#
# Args:
#   token: LENS API access token, sent as the Authorization header.
#   query: request body as a JSON string (see the LENS scholarly search docs).
# Returns: an httr response object; extract the body with httr::content().
getLENSData <- function(token, query){
  url <- 'https://api.lens.org/scholarly/search'
  headers <- c('Authorization' = token, 'Content-Type' = 'application/json')
  # Fully qualify add_headers() to match the httr::POST() call, so the
  # function also works when httr has not been attached with library().
  httr::POST(url = url, httr::add_headers(.headers = headers), body = query)
}

#prepare request
max_results <- 500 # limit on records per request; max currently allowed by LENS is 500
article_list <- dat$Title # use dat50$Title instead for testing
article_title <- article_list[1] # extract the first title from the list
readRenviron("~/.Renviron")
token <- Sys.getenv("LENS_TOKEN") # access token saved in the local environment

#initial query with the first title only
# Titles may contain backslashes or double quotes, which would break the
# hand-built JSON body, so escape them before interpolation (bug fix: the
# original interpolated the raw title into the JSON string).
escaped_title <- gsub('"', '\\"', gsub('\\', '\\\\', article_title, fixed = TRUE), fixed = TRUE)
request <- paste0('{"query":  {"match_phrase": {"title": "', escaped_title,
                  '"}}, "size": "', max_results, '","scroll": "1m"}')
# optionally restrict the returned fields by adding e.g.
# , "include": ["lens_id", "authors", "publication_type", "title"]
data <- getLENSData(token, request)
record_json <- content(data, "text")
record_list <- jsonlite::fromJSON(record_json) #convert json output from article search to list
#names(record_list)
#str(record_list$data)
print("iteration = 1")
record_df <- data.frame(record_list) #convert it into a data frame
dim(record_df)
#names(record_df)
#record_df$data.title
#record_df$data.authors
#total <- record_list[["total"]] #1

#use a loop to run the query for the remaining titles and append matches to record_df
# seq_along(...)[-1] instead of 2:length(...): yields an empty sequence when
# there is only one title, whereas 2:1 would loop backwards.
for (i in seq_along(article_list)[-1]){
  article_title <- article_list[i] #extract the title from the list
  # escape backslashes and double quotes so the hand-built JSON stays valid
  escaped_title <- gsub('"', '\\"', gsub('\\', '\\\\', article_title, fixed = TRUE), fixed = TRUE)
  request <- paste0('{"query":  {"match_phrase": {"title": "', escaped_title,
                    '"}}, "size": "', max_results, '","scroll": "1m"}')
  data <- getLENSData(token, request)
  print(paste("iteration = ",i))
  record_json <- content(data, "text")
  record_list <- jsonlite::fromJSON(record_json) # convert json output from article search to list
  # plain if/else instead of ifelse(): ifelse() is vectorized and evaluates
  # both branch values, so it is the wrong tool for scalar control flow
  if (record_list$total == 0) {
    print(paste0("record not found for: ", article_title))
  } else {
    new_df <- data.frame(record_list)
    # bind the latest search result to the accumulated data frame
    # (grows in a loop; acceptable here because the API calls dominate runtime)
    record_df <- dplyr::bind_rows(record_df, new_df)
  }
  Sys.sleep(2.5) # slow down to stay under the 50-calls-per-minute API rate limit
}


dim(record_df)    # 1073 rows from 1019 titles - some titles matched multiple records
names(record_df)
length(unique(record_df$data.title)) # duplicated titles present; roughly 45 titles returned no match
#View(cbind(article_list, record_df$data.title)) # positions are shifted!

class(record_df)  # a data.frame, but it contains nested list-columns

# Persist the raw LENS output so the long API query does not need re-running.
save(record_df, file = here("data", "LENS_dataframe.RData"))

Information about LENS output values: https://docs.api.lens.org/response-scholar.html

Process LENS output

#start from loading the saved output from LENS
# use here() for the path, consistent with how the file was saved above
# (the original "./data/..." path depended on the working directory)
load(here("data", "LENS_dataframe.RData")) #loads record_df data object

names(record_df)
##  [1] "scroll_id"                           "total"                              
##  [3] "data.lens_id"                        "data.title"                         
##  [5] "data.publication_type"               "data.year_published"                
##  [7] "data.date_published_parts"           "data.created"                       
##  [9] "data.external_ids"                   "data.authors"                       
## [11] "data.source"                         "data.fields_of_study"               
## [13] "data.volume"                         "data.issue"                         
## [15] "data.languages"                      "data.references"                    
## [17] "data.source_urls"                    "data.abstract"                      
## [19] "data.references_count"               "data.scholarly_citations_count"     
## [21] "data.start_page"                     "data.end_page"                      
## [23] "data.scholarly_citations"            "data.author_count"                  
## [25] "results"                             "data.date_published"                
## [27] "data.keywords"                       "data.publication_supplementary_type"
## [29] "data.mesh_terms"                     "data.chemicals"                     
## [31] "data.open_access"                    "data.funding"                       
## [33] "data.is_open_access"                 "data.conference"
dim(record_df)
## [1] 1073   34
length(unique(record_df$data.title)) #974 unique titles
## [1] 974
#View(record_df[duplicated(record_df$data.title) | duplicated(record_df$data.title, fromLast=TRUE), ]) #visual check - some duplicate records carry more info than others
sum(sapply(record_df$data.fields_of_study, is.null)) #records without fields_of_study, likely to have other data missing too
## [1] 129
# NOTE(review): despite its name, this flag is TRUE when fields_of_study is
# MISSING; FALSE sorts first below, which keeps the more complete duplicate.
record_df$data.has_fields_of_study <- sapply(record_df$data.fields_of_study, is.null)

# Keep one row per title, preferring rows that do have fields_of_study.
record_df %>% arrange(data.has_fields_of_study, data.title) %>% distinct(data.title, .keep_all = TRUE) -> record_df_unique
record_df <- record_df_unique #reassign
dim(record_df) #974
## [1] 974  35
# Publication year distribution
par(mar = c(4, 4, 2, 2))
hist(record_df$data.year_published, main = "Publication year")

# Publication types (ascending sort for the horizontal barplot)
par(mar = c(4, 15, 2, 2))
barplot(sort(table(record_df$data.publication_type)), horiz = TRUE, las = 1,
        xlab = "Count", main = "Publication type")

# Ten most frequent publication sources (journals)
par(mar = c(4, 18, 2, 2))
top_sources <- head(sort(table(record_df$data.source$title), decreasing = TRUE), 10)
barplot(sort(top_sources), horiz = TRUE, las = 1, xlab = "Count",
        main = "Top10 publication sources (journals)")

# Ten most frequent fields of study (each record holds a vector of fields)
par(mar = c(4, 10, 2, 2))
top_fields <- head(sort(table(unlist(record_df$data.fields_of_study)), decreasing = TRUE), 10)
barplot(sort(top_fields), horiz = TRUE, las = 1, xlab = "Count",
        main = "Top10 fields of study")

#record_df$data.fields_of_study[[1]][1] # first field of the first paper
#record_df$data.fields_of_study[1]      # full field list of the first paper

# Citation-count availability and distribution
par(mar = c(4, 5, 2, 2))
barplot(table(!is.na(record_df$data.scholarly_citations_count)), horiz = TRUE, las = 1,
        xlab = "Count", main = "Has count of citations?") # NA should probably be treated as 0

hist(record_df$data.scholarly_citations_count, main = "Number of citations",
     breaks = 100) # NA values are ignored by hist()

# length(unlist(record_df$data.scholarly_citations)) # 45097 citation LENS ids in total

par(mar = c(4, 5, 2, 2))
barplot(table(!is.na(record_df$data.references_count)), horiz = TRUE, las = 1,
        xlab = "Count", main = "Has references?")

# length(unlist(record_df$data.references)) # 30659 reference LENS ids in total

# Pull the DOI (where present) out of each record's external_ids table.
doi <- unlist(lapply(record_df$data.external_ids,
                     function(ch) expss::vlookup('doi', ch, result_column = 'value',
                                                 lookup_column = 'type')))

par(mar = c(4, 5, 2, 2))
barplot(table(!is.na(doi)), horiz = TRUE, las = 1, xlab = "Count", main = "Has doi?")

Extract author data.

# Flatten the nested author tables: one row per (publication, author).
record_df_data.authors <- record_df %>%
  select(data.lens_id, data.title, data.publication_type,
         data.year_published, data.authors) %>%
  unnest(data.authors)
#str(record_df_data.authors) # a tibble; ids and affiliations remain list-columns
names(record_df_data.authors)
## [1] "data.lens_id"          "data.title"            "data.publication_type"
## [4] "data.year_published"   "first_name"            "last_name"            
## [7] "initials"              "ids"                   "affiliations"
# Unnest author ids: one row per (publication, author, id).
record_df_data.authors.ids <- record_df_data.authors %>%
  select(data.lens_id, data.title, data.publication_type, data.year_published,
         first_name, last_name, initials, ids) %>%
  unnest(ids)
dim(record_df_data.authors.ids) #4052
## [1] 4052    9
#names(record_df_data.authors.ids) # tibble with unnested ids; use data.lens_id/value for igraph

# Author display label: "last_name, initials".
record_df_data.authors.ids$Author <- paste(record_df_data.authors.ids$last_name,
                                           record_df_data.authors.ids$initials,
                                           sep = ", ")

## Check overlaps and inconsistencies:
#record_df_data.authors.ids %>% count(Author, type, value, sort = FALSE) %>% View # some authors appear multiple times when several id types exist for them (e.g. Aavik, T; Park, KJ; Anderson, SH) - selectively remove orcid (as the less common type).
#record_df_data.authors.ids %>% count(type, Author, value, sort = FALSE) %>% View  # some authors have multiple ids of the same type (e.g. Bigler, F; Brandle, JR; Bright, JA)
#record_df_data.authors.ids %>% count(value, type, Author, sort = FALSE) %>% View # some "multiple" authors share one id

#View(arrange(record_df_data.authors.ids[duplicated(record_df_data.authors.ids$value) | duplicated(record_df_data.authors.ids$value, fromLast=TRUE), ], value)) # visual check - some records carry more info than others

# Rows whose id value occurs more than once, sorted by id for inspection.
dup_value <- duplicated(record_df_data.authors.ids$value) |
  duplicated(record_df_data.authors.ids$value, fromLast = TRUE)
value_check_df <- arrange(record_df_data.authors.ids[dup_value, ], value)

# Composite key "type-value-Author" used to spot one id mapping to several names.
value_check_df$value_Author <- paste(value_check_df$type, value_check_df$value,
                                     value_check_df$Author, sep = "-")
#View(duplicated(distinct(value_check_df, value_Author, .keep_all = TRUE)), ) # 654 rows to be checked!

value_check_df2 <- value_check_df %>%
  group_by(value) %>%
  filter(n() > 1) %>%
  distinct(value_Author, .keep_all = TRUE)
names(value_check_df2)
##  [1] "data.lens_id"          "data.title"            "data.publication_type"
##  [4] "data.year_published"   "first_name"            "last_name"            
##  [7] "initials"              "type"                  "value"                
## [10] "Author"                "value_Author"
# Ids that map to more than one distinct Author label - to fix below.
check_values <- value_check_df2$value[duplicated(value_check_df2$value)]

##substitute values with correct author first name and initials:
## NOTE(review): these manual fixes are order-dependent (case 5 rewrites
## `value` before later statements re-match on the old orcid), so keep the
## statement order exactly as is.

#1 
#orcid 0000-0001-5002-106X "Reberg-Horton, SC" to Reberg-Horton, C
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[1], c("value", "Author", "first_name", "initials")]
#View(record_df_data.authors.ids[record_df_data.authors.ids$last_name == "Reberg-Horton", ]) #one orcid, 3 different magid! - remove all rows with magid, make first name Chris and initials C
# Drop every Reberg-Horton row whose id type is magid (keeps only the orcid row).
record_df_data.authors.ids <- subset(record_df_data.authors.ids, last_name != "Reberg-Horton" | type != "magid")
dim(record_df_data.authors.ids) #remove all records with magid
## [1] 4047   10
# first_name matches on last_name (broader), initials/Author on the exact label.
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0001-5002-106X" & record_df_data.authors.ids$last_name == "Reberg-Horton"] <- "Chris" 
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0001-5002-106X" & record_df_data.authors.ids$Author == "Reberg-Horton, SC"] <- "C" 
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0001-5002-106X" & record_df_data.authors.ids$Author == "Reberg-Horton, SC"] <- "Reberg-Horton, C" 

#2
#orcid 0000-0001-5069-0204 "Baker, M" to "Baker, ME" :
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[2], c("value", "Author", "first_name", "initials")]
#record_df_data.authors.ids[record_df_data.authors.ids$value == "0000-0001-5069-0204", ]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0001-5069-0204" & record_df_data.authors.ids$Author == "Baker, M"] <- "Matthew E." 
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0001-5069-0204" & record_df_data.authors.ids$Author == "Baker, M"] <- "ME" 
# Author is updated last so the "Baker, M" match above still succeeds.
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0001-5069-0204" & record_df_data.authors.ids$Author == "Baker, M"] <- "Baker, ME" 

#3
#orcid 0000-0001-6431-9959 "Pywell, R" to "Pywell, RF"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[3], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0001-6431-9959" & record_df_data.authors.ids$Author == "Pywell, R"] <- "Richard F." 
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0001-6431-9959" & record_df_data.authors.ids$Author == "Pywell, R"] <- "RF" 
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0001-6431-9959" & record_df_data.authors.ids$Author == "Pywell, R"] <- "Pywell, RF" 

#4
#orcid 0000-0001-9558-0586  "Marshall, E" to "Marshall, EJP"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[4], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0001-9558-0586" & record_df_data.authors.ids$Author == "Marshall, E"] <- "E. J. P." 
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0001-9558-0586" & record_df_data.authors.ids$Author == "Marshall, E"] <- "EJP" 
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0001-9558-0586" & record_df_data.authors.ids$Author == "Marshall, E"] <- "Marshall, EJP" 

#5
#orcid 0000-0002-1800-4558 was wrongly shared by two authors:
#  Alain Butet           - reassign to orcid 0000-0002-9173-3466
#  Agnes Fargue-Lelievre - reassign to orcid 0000-0002-0426-8931, first name to Agnes
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[5], c("value", "Author", "first_name", "initials")]
# Order matters: Butet's value is rewritten first, so the two statements below
# (which still match the OLD orcid) only touch the remaining rows.
record_df_data.authors.ids$value[record_df_data.authors.ids$value == "0000-0002-1800-4558" & record_df_data.authors.ids$Author == "Butet, A"] <- "0000-0002-9173-3466" 
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0002-1800-4558" & record_df_data.authors.ids$first_name == "A."] <- "Agnes" 
record_df_data.authors.ids$value[record_df_data.authors.ids$value == "0000-0002-1800-4558" & record_df_data.authors.ids$Author == "Fargue-Lelièvre, A"] <- "0000-0002-0426-8931" 

#6 
#orcid 0000-0002-4202-2043 fix "NORRDAHL, K" "K" to "Norrdahl, K" "Kai"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[6], c("value", "Author", "first_name", "initials", "last_name")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0002-4202-2043" & record_df_data.authors.ids$Author == "NORRDAHL, K"] <- "Kai" 
record_df_data.authors.ids$last_name[record_df_data.authors.ids$value == "0000-0002-4202-2043" & record_df_data.authors.ids$Author == "NORRDAHL, K"] <- "Norrdahl" 
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0002-4202-2043" & record_df_data.authors.ids$Author == "NORRDAHL, K"] <- "Norrdahl, K" 

#7
#orcid 0000-0003-0300-9951 "Woodcock, B" ("B.A.") to "Woodcock, BA" "Ben A."
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[7], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-0300-9951" & record_df_data.authors.ids$Author == "Woodcock, B"] <- "Ben A." 
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-0300-9951" & record_df_data.authors.ids$Author == "Woodcock, B"] <- "BA" 
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-0300-9951" & record_df_data.authors.ids$Author == "Woodcock, B"] <- "Woodcock, BA" 

#8
#orcid 0000-0003-1416-6047 "Bourn, NAD" to "Bourn, NA"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[8], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-1416-6047" & record_df_data.authors.ids$Author == "Bourn, NAD"] <- "Nigel A.D." 
# NOTE: "NA" below is the literal string (Nigel A.), not a missing value.
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-1416-6047" & record_df_data.authors.ids$Author == "Bourn, NAD"] <- "NA" 
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-1416-6047" & record_df_data.authors.ids$Author == "Bourn, NAD"] <- "Bourn, NA" 

#9
#orcid 0000-0003-3616-5563 "FINN, J" to "Finn, JA"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[9], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-3616-5563" & record_df_data.authors.ids$Author == "FINN, J"] <- "John A." 
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-3616-5563" & record_df_data.authors.ids$Author == "FINN, J"] <- "JA" 
record_df_data.authors.ids$last_name[record_df_data.authors.ids$value == "0000-0003-3616-5563" & record_df_data.authors.ids$Author == "FINN, J"] <- "Finn" 
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-3616-5563" & record_df_data.authors.ids$Author == "FINN, J"] <- "Finn, JA" 

#10
#orcid 0000-0003-3742-7035 "Holland, J" "J.M." to "Holland, JM" "John M."
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[10], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-3742-7035" & record_df_data.authors.ids$Author == "Holland, J"] <- "John M." 
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-3742-7035" & record_df_data.authors.ids$Author == "Holland, J"] <- "JM" 
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-3742-7035" & record_df_data.authors.ids$Author == "Holland, J"] <- "Holland, JM" 

#11
#orcid 0000-0003-4225-9451 "Rahman, M" to "Rahman, MM"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[11], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-4225-9451" & record_df_data.authors.ids$Author == "Rahman, M"] <- "Mizanur Md." 
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-4225-9451" & record_df_data.authors.ids$Author == "Rahman, M"] <- "MM" 
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-4225-9451" & record_df_data.authors.ids$Author == "Rahman, M"] <- "Rahman, MM" 

#12
#orcid 0000-0003-4382-7051 "Sparks, T" "T.H" to "Sparks, TH" "Tim H."
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[12], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-4382-7051" & record_df_data.authors.ids$Author == "Sparks, T"] <- "Tim H." 
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-4382-7051" & record_df_data.authors.ids$Author == "Sparks, T"] <- "TH" 
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-4382-7051" & record_df_data.authors.ids$Author == "Sparks, T"] <- "Sparks, TH" 

# Some of these will need to be manually resolved - e.g. checking if these are really different people

# NOTE(review): this subset repeats the Reberg-Horton cleanup done above in
# case 1; it is idempotent, so it removes nothing further here.
record_df_data.authors.ids <- subset(record_df_data.authors.ids, last_name != "Reberg-Horton" | type != "magid")
dim(record_df_data.authors.ids) #remove all records with magid
## [1] 4047   10
#check ids
table(is.na(record_df_data.authors.ids$type)) #no missing values
## 
## FALSE 
##  4047
table(is.na(record_df_data.authors.ids$value)) #no missing values
## 
## FALSE 
##  4047
table(record_df_data.authors.ids$type)
## 
## magid orcid 
##  3468   579
#clean by removing orcid if magid is available
# min(type) works because "magid" sorts alphabetically before "orcid"; the
# grouped filter keeps only the preferred id type per (title, author).
# NOTE(review): the result stays grouped (no ungroup()); later code only
# extracts columns, so this is currently harmless.
record_df_data.authors.ids %>%
    group_by(data.title, Author) %>%
    filter(type==min(type)) -> record_df_data.authors.ids
table(record_df_data.authors.ids$type) #3468 magid, 11 orcid remain (see output)
## 
## magid orcid 
##  3468    11
#check how many authors with same name but different ids
record_df_data.authors.ids %>%
    group_by(Author) %>%
    summarise(count = n_distinct(value)) %>%
    filter(count > 1) -> check_next
 
# 134 Authors with multiple id values - further checking needed, e.g.:
dim(check_next)
## [1] 134   2
#record_df_data.authors.ids[record_df_data.authors.ids$Author == "Anderson, SH",] #same as below
#record_df_data.authors.ids[record_df_data.authors.ids$Author == as.character(check_next[1,"Author"]), ]

####### finished cleaning ids data frame for now


# Flatten affiliations: one row per (publication, author, affiliation).
record_df_data.authors.aff <- record_df_data.authors %>%
  select(data.lens_id, data.title, data.publication_type, data.year_published,
         first_name, last_name, initials, affiliations) %>%
  unnest(affiliations)
dim(record_df_data.authors.aff) # 2968; some articles lack affiliation data, especially the country code
## [1] 2968   10
#names(record_df_data.authors.aff) # tibble with unnested affiliations
table(is.na(record_df_data.authors.aff$country_code)) # missing country codes - may need imputing, e.g. via GRID (Global Research Identifier Database, https://www.grid.ac/) using the affiliation name
## 
## FALSE  TRUE 
##  2083   885
####### affiliation cleaning deferred to later

Processing summary:
- Author data: partially cleaned, a bit more to do.
- Affiliation data: not bad, but needs cleaning and imputation of missing values.

Use igraph to create collaboration networks for authors

# Edge-list input for igraph: one row per (publication, author, author id).
dt <- data.frame(pub.id = record_df_data.authors.ids$data.lens_id,
                 Author = record_df_data.authors.ids$Author,
                 value = record_df_data.authors.ids$value)
str(dt)
## 'data.frame':    3479 obs. of  3 variables:
##  $ pub.id: chr  "091-265-753-392-171" "091-265-753-392-171" "091-265-753-392-171" "091-265-753-392-171" ...
##  $ Author: chr  "MacLeod, A" "Wratten, SD" "Sotherton, NW" "Thomas, MB" ...
##  $ value : chr  "2293860217" "2083225834" "2702469488" "2135687599" ...
# Self-join on publication id yields all co-author pairs within a paper;
# value.x < value.y keeps each unordered pair once and drops self-pairs.
coauthor_pairs <- dt %>%
  inner_join(dt, by = "pub.id") %>%
  filter(value.x < value.y) %>%
  count(Author.x, Author.y)
g1 <- graph_from_data_frame(coauthor_pairs, directed = FALSE) # author names as vertices

# Alternative with author ids as vertices:
# dt %>%
#   inner_join(dt, by = "pub.id") %>%
#   filter(Author.x < Author.y) %>%
#   count(value.x, value.y) %>%
#   graph_from_data_frame(directed = FALSE) -> g1

# Inspect components of the igraph object:
# E(g1)
# V(g1)
# g1[]
# edge_attr(g1)
# vertex_attr(g1)
# as_data_frame(g1, what = "edges")

#plot(g1) # basic plot - unreadable at this size
plot(g1,
     edge.arrow.size = 0, vertex.color = "gold", vertex.size = 5,
     vertex.frame.color = "gray", vertex.label.color = "black",
     vertex.label.cex = 0.8, vertex.label.dist = 2, edge.curved = 0.2,
     vertex.label = NA)

# Simplify: drop loops and merge parallel edges (summing any "weight"
# attribute, ignoring the rest).
g1s <- simplify(g1, remove.multiple = TRUE, remove.loops = TRUE,
                edge.attr.comb = list(weight = "sum", "ignore"))
plot(g1s,
     edge.arrow.size = 0, vertex.color = "gold", vertex.size = 5,
     vertex.frame.color = "gray", vertex.label.color = "black",
     vertex.label.cex = 0.8, vertex.label.dist = 2, edge.curved = 0.2,
     vertex.label = NA)

E(g1s)$weight <- 1                  # assign a weight of 1 to every edge
E(g1s)$width <- E(g1s)$weight * 2   # edge drawing width from weight

#grep("^layout_", ls("package:igraph"), value=TRUE)[-1]  # list of available layouts
# Plot with an automatic layout and without node labels.
plot(g1s, layout = layout_nicely, edge.arrow.size = 0, vertex.color = "gold",
     vertex.size = 2, vertex.frame.color = "gray", vertex.label.color = "black",
     vertex.label = NA)

## Find cliques (complete subgraphs of an undirected graph)
# g1s_cliques <- cliques(as.undirected(g1s), min=10) # list of cliques
# hist(sapply(g1s_cliques, length)) # histogram of clique sizes
# g1s_cliques_largest <- largest_cliques(as.undirected(g1s)) # cliques with max number of nodes
# vcol <- rep("grey80", vcount(as.undirected(g1s)))
# vcol[unlist(g1s_cliques_largest)] <- "gold"
# plot(as.undirected(g1s), layout=layout_nicely, vertex.color=vcol, vertex.size=2, vertex.label=NA)

## Community detection based on edge betweenness (Newman-Girvan):
## high-betweenness edges are removed sequentially (recalculating at each
## step) and the best partitioning of the network is selected.
g1s_undirected <- as.undirected(g1s)
ceb <- cluster_edge_betweenness(g1s_undirected)
length(ceb) # number of communities found
## [1] 274
#membership(ceb) # author -> community number
modularity(ceb) # high modularity
## [1] 0.9661821
plot(ceb, g1s_undirected, layout = layout_nicely, vertex.size = 2, vertex.label = NA)

#Community detection based on propagating labels
clp <- cluster_label_prop(g1s)
plot(clp, g1s, layout=layout_nicely, edge.arrow.size=0.0, vertex.size=2, vertex.label=NA)

#Community detection based on greedy optimization of modularity
cfg <- cluster_fast_greedy(as.undirected(g1s))
plot(cfg, as.undirected(g1s), layout=layout_nicely, edge.arrow.size=0.0, vertex.size=2, vertex.label=NA)

length(cfg) #273 communities (see output below)
## [1] 273
modularity(cfg) #high modularity
## [1] 0.9630548
#order(sizes(cfg), decreasing=TRUE)
# sizes(cfg) is indexed by community id 1..n, so the first position of the
# ordering is the id of the largest community.
order(sizes(cfg), decreasing=TRUE)[1] # largest community nr2
## [1] 2
members_2 <- membership(cfg)[membership(cfg)==2] #members of the community nr2
str(members_2)
##  Named num [1:133] 2 2 2 2 2 2 2 2 2 2 ...
##  - attr(*, "names")= chr [1:133] "Ahmed, SI" "Al-wadaey, A" "Arora, K" "Baker, JL" ...
names(members_2)
##   [1] "Ahmed, SI"            "Al-wadaey, A"         "Arora, K"            
##   [4] "Baker, JL"            "Bharati, L"           "Blanco-Canqui, H"    
##   [7] "Boldt, AL"            "Bragan, RJ"           "Brandle, JR"         
##  [10] "Burkart, MR"          "Burras, CL"           "Cambardella, CA"     
##  [13] "Castellano, MJ"       "Chendev, YG"          "Colletti, JP"        
##  [16] "Cox, R"               "Cruse, RM"            "Danielson, SD"       
##  [19] "Dosskey, MG"          "Durso, LM"            "Eghball, B"          
##  [22] "Eigenberg, RA"        "Eisenhauer, DE"       "Fischer, JR"         
##  [25] "Franti, TG"           "García, VJ"           "Gennadiev, AN"       
##  [28] "Gilley, JE"           "Grala, RK"            "Harrell, MO"         
##  [31] "Helmers, MJ"          "Hernandez-Santana, V" "Hernandez, G"        
##  [34] "Hoagland, KD"         "Hodges, L"            "Hoffman, DW"         
##  [37] "Hubbard, KG"          "Iqbal, J"             "Isenhart, TM"        
##  [40] "James, DE"            "Jasa, PJ"             "Jaynes, DB"          
##  [43] "Kaspar, TC"           "Kellerman, T"         "Kelly, JM"           
##  [46] "Kim, DG"              "Kolka, R"             "Kolka, RK"           
##  [49] "Kovar, JL"            "Krajewski, WF"        "Kramer, LA"          
##  [52] "Lamb, SJ"             "Li, X"                "Liebman, M"          
##  [55] "Logsdon, SD"          "Loynachan, TE"        "Mabry, CM"           
##  [58] "Marx, DB"             "McCullough, MC"       "McInnes, KJ"         
##  [61] "Mickelson, SK"        "Mize, CW"             "Moore, LAS"          
##  [64] "Moorman, TB"          "Mueller, TG"          "Neelakantan, S"      
##  [67] "O'Neal, ME"           "Ogbuehi, SN"          "Parker, DB"          
##  [70] "Parkin, TB"           "Pérez-Suárez, M"      "Peters, CJ"          
##  [73] "Petin, AN"            "Petina, VI"           "Pfeiffer, RL"        
##  [76] "Powers, W"            "Quist, MC"            "Rienzi, E"           
##  [79] "Russell, JR"          "Sauer, TJ"            "Schaefer, AJ"        
##  [82] "Schilling, KE"        "Schmitt, TJ"          "Schroeder, PD"       
##  [85] "Schulte, LA"          "Schultz, RC"          "Senseman, SA"        
##  [88] "Simpkins, WW"         "Smith, TE"            "Snow, DD"            
##  [91] "Sokolowsky, R"        "Soni, B"              "Starr, JL"           
##  [94] "Stewart, TW"          "Thompson, ML"         "Tierney, DP"         
##  [97] "Tomer, MD"            "Trabue, S"            "Tufekcioglu, A"      
## [100] "Webber, DF"           "Wigen, SL"            "Woodbury, BL"        
## [103] "Wortmann, CS"         "Wright, RJ"           "Yamada, T"           
## [106] "Zaimes, GN"           "Zazdravnykh, EA"      "Zhang, H"            
## [109] "Zhou, X"              "Zuberer, DA"          "Shapiro, CA"         
## [112] "Lee, K-"              "Dix, ME"              "Meyer, GE"           
## [115] "Srinivas, P"          "Suratman, MN"         "Novykh, LL"          
## [118] "Márquez, CO"          "Asbjornsen, H"        "Ghaffarzadeh, M"     
## [121] "Hessel, R"            "Dosskey, M"           "Bartelt-Hunt, SL"    
## [124] "Tyndall, JC"          "Chen, B"              "Krutz, LJ"           
## [127] "Berges, SA"           "Collettil, JP"        "Kim, D"              
## [130] "Knight, KW"           "Lee, KH"              "Raich, JW"           
## [133] "Hirsh, SM"
#remove all nodes not in community 2, by vertex name
g1s2 <- delete_vertices(g1s, names(membership(cfg)[membership(cfg)!=2]))
plot(g1s2, layout=layout_nicely, edge.arrow.size=0, vertex.color="gold", vertex.size=5, vertex.frame.color="gray", 
     vertex.label.color="black", vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2) #plot of largest community (nr2)

#plot_dendrogram(ceb, mode="hclust") #dendrogram plot - too dense!

#more at: https://kateto.net/netscix2016.html

Co-authorship networks: a few big clusters exist - get their affiliations and key works, based on the cluster membership.

Use igraph to create collaboration networks for institutions

Note: needs to be redone after cleaning and imputing affiliation data.

# Unique institution identifier: country code + GRID institution id.
record_df_data.authors.aff$country_grid.id <- paste(record_df_data.authors.aff$country_code,
                                                    record_df_data.authors.aff$grid_id, sep = ", ")
record_df_data.authors.aff$Author <- paste(record_df_data.authors.aff$last_name,
                                           record_df_data.authors.aff$initials, sep = ", ")

# Edge-list input for igraph: one row per (publication, author, institution).
dti <- data.frame(pub.id = record_df_data.authors.aff$data.lens_id,
                  Author = record_df_data.authors.aff$Author,
                  value = record_df_data.authors.aff$country_grid.id)
str(dti)
## 'data.frame':    2968 obs. of  3 variables:
##  $ pub.id: chr  "091-265-753-392-171" "091-265-753-392-171" "091-265-753-392-171" "091-265-753-392-171" ...
##  $ Author: chr  "MacLeod, A" "Wratten, SD" "Wratten, SD" "Sotherton, NW" ...
##  $ value : chr  "GB, grid.5491.9" "GB, grid.5491.9" "NA, NA" "NA, NA" ...
# Institution pairs whose authors co-wrote a paper (author pairs deduplicated
# via Author.x < Author.y).
inst_pairs <- dti %>%
  inner_join(dti, by = "pub.id") %>%
  filter(Author.x < Author.y) %>%
  count(value.x, value.y)
g1i <- graph_from_data_frame(inst_pairs, directed = FALSE)

#as_data_frame(g1i, what = "edges")

# plot(g1i)
# plot(g1i, edge.arrow.size=0, vertex.color="gold", vertex.size=5,
#      vertex.frame.color="gray", vertex.label.color="black",
#      vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2)

#E(g1i)$weight <- 1 # add weights
# Simplify: drop loops and merge parallel edges.
g1is <- simplify(g1i, remove.multiple = TRUE, remove.loops = TRUE,
                 edge.attr.comb = list(weight = "sum", "ignore"))
plot(g1is, layout = layout_nicely, edge.arrow.size = 0, vertex.color = "gold",
     vertex.size = 5, vertex.frame.color = "gray", vertex.label.color = "black",
     vertex.label.cex = 0.8, vertex.label.dist = 2, edge.curved = 0.2)

#E(g1is)
#V(g1is)

## Find cliques (complete subgraphs of an undirected graph)
# cliques(as.undirected(g1is)) # list of cliques
# sapply(cliques(as.undirected(g1is)), length) # clique sizes
# largest_cliques(as.undirected(g1is)) # clique with max number of nodes
# vcol <- rep("grey80", vcount(as.undirected(g1is)))
# vcol[unlist(largest_cliques(as.undirected(g1is)))] <- "gold"
# plot(as.undirected(g1is), vertex.label=V(g1is)$name, vertex.color=vcol)

## Community detection based on edge betweenness (Newman-Girvan):
## high-betweenness edges are removed sequentially (recalculating at each
## step) and the best partitioning of the network is selected.
g1is_undirected <- as.undirected(g1is)
cebi <- cluster_edge_betweenness(g1is_undirected)
#dendPlot(cebi, mode="hclust") # too dense to read

plot(cebi, g1is_undirected, layout = layout_nicely, edge.arrow.size = 0,
     vertex.color = "gold", vertex.size = 2, vertex.frame.color = "grey",
     vertex.label.color = "black", vertex.label = NA)

#more at: https://kateto.net/netscix2016.html, pretty_plots.R
#text cleaning for word analysis: https://lukesingham.com/how-to-make-a-word-cloud-using-r/

To do

  • Author data needs a bit more cleaning - ids and affiliations
  • Create co-citation and bibliographic coupling networks
  • Plan figures for the manuscript